# 开发作者: HeBugui
# 开发时间 :2022/3/20 14:29
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

from wordCloud import word_cloud
from time import time
start = time()
def count_time():
    minutes = round((time() - start), 2)
    print("程序运行了" + str(minutes) + "秒")

problem_log = pd.read_csv("junyi_data/problemlog/junyi_ProblemLog_original_part_10w.csv")  # 部分数据
exercise = pd.read_csv("junyi_data/junyi_Exercise_table.csv")
exercise_correct = pd.read_csv("junyi_data/problemlog/exercise_correct.csv")
student_correct = pd.read_csv('junyi_data/problemlog/student_correct.csv')

#词云图路径
path = "static/assets/img/"
path2 = "static/assets/img/"

#基本信息
def base_info():
    print("*"*100,"基本信息")
    print(problem_log.info())
    print(problem_log.head())
    print(problem_log.describe())


#统计某个学生的做题记录数
def demo():
    user = problem_log[problem_log["user_id"] == 12884]
    user_count = problem_log["user_id"].value_counts()

#统计做题数较多的学生id
def number_count():
    print("*"*100,"分类统计user_id")
    number_count = problem_log["user_id"].value_counts()
    number_count.to_csv("junyi_data/problemlog/number_count.csv")
    print(number_count[:12].index.tolist())
    return number_count[:10]

#分析正确率最低的题目和area
def mini_exercise():
    exercise_correct = problem_log[['exercise','correct']]
    #新增一列
    right = []
    for i in range(len(exercise_correct)):
        # print(type(exercise_correct['correct'][i]),exercise_correct['correct'][i])
        if exercise_correct['correct'][i]:
            right.append(1)
        else:
            right.append(0)
    dataframe = pd.DataFrame({'a_name':right})
    exercise_correct.insert(2,'right',dataframe)
    exercise_correct = exercise_correct.drop(['correct'],axis=1)
    gb = exercise_correct.groupby(by = ['exercise','right'])['right'].agg([('right',np.sum)])
    gb.to_csv("junyi_data/problemlog_gb.csv")



#哪些学生做题数量最多
def excellent_student():
    # 返回series类型
    excellent_stu = problem_log.groupby('user_id').size().sort_values(ascending=False)[:20]
    return excellent_stu


#数据预处理
def init_dataset():
    # 两表连接,删除不必要的列
    exercise2 = exercise[['name','short_display_name','topic','area']]
    problem_log3 = problem_log[['user_id','exercise','time_taken','points_earned']]
    problemlog_exercise = pd.merge(problem_log3, exercise2, left_on='exercise',right_on='name')
    problemlog_exercise.drop(['name'],axis=1,inplace=True)
    problemlog_exercise.rename(columns={'user_id':'学号','exercise':'所做练习名','time_taken':'花费时间（分）','points_earned':'获得积分数','short_display_name':'中文名称','topic':'话题','area':'领域'},inplace=True)
    problemlog_exercise.to_csv("junyi_data/problemlog_exercise.csv")
    return problemlog_exercise


#做题数量最多学生刻画印象
def most_student(problemlog_exercise,student_id = 88703):
    student_dataset = problemlog_exercise[problemlog_exercise['学号'] == student_id]
    # 提取词云字符串
    cn_name = list(student_dataset['中文名称'])
    topic_info = list(student_dataset['话题'])
    area_info = list(student_dataset['领域'])
    cn_str = topic_area_str = ""#初始化为空
    for i in range(len(cn_name)):
        cn_str = cn_str + " " + cn_name[i]
        topic_area_str = topic_area_str + " " + topic_info[i] + " " + area_info[i]

    # 绘制词云图
    title = str(student_id)
    word_cloud(cn_str, path, title + '最喜欢的题目')
    word_cloud(topic_area_str, path2, title + '最喜欢的领域及话题')

#对做题数和正确率（积分）进行聚类分析
def cluster(problem_exercise):
    count_num = problem_exercise.groupby(by=['学号']).size()#默认by升序
    count_points = problem_exercise.groupby(by=['学号'])['获得积分数'].agg([('获得积分数',np.sum)])
    #series转换成dataframe
    dict_num = {'学号': count_num.index, '做题数量': count_num.values}
    df_num = pd.DataFrame(dict_num)
    #合并df
    num_points = pd.merge(df_num,count_points,left_on='学号',right_on=count_points.index)
    return num_points

#绘制散点图
def cluster_draw(count_num,count_points):
    plt.rcParams['font.family'] = ['SimHei']
    plt.rcParams['axes.unicode_minus'] = False
    title = '获得积分与做题数量的关系'
    plt.title(title)
    plt.xlabel('做题数量')
    plt.ylabel('获得积分')
    x = count_num
    y = count_points
    plt.scatter(x,y)
    plt.savefig('images/获得积分与做题数量的关系.jpg')
    plt.show()

#学生最喜欢的题目类型（领域话题&名称）
def alllike_topic_area(problem_exercise):
    alllike_topic = problem_exercise.groupby(by='话题').size().sort_values(ascending=False)[:7]
    list_topic = []
    for i in range(len(alllike_topic)):
        dict = {}
        dict['name'] = alllike_topic.index[i]
        dict['value'] = alllike_topic[i]
        list_topic.append(dict)
    alllike_area = problem_exercise.groupby(by='领域').size().sort_values(ascending=False)[:7]
    list_area = []
    for i in range(len(alllike_area)):
        dict = {}
        dict['name'] = alllike_area.index[i]
        dict['value'] = alllike_area[i]
        list_area.append(dict)
    alllike_exercise_name = problem_exercise.groupby(by='所做练习名').size().sort_values(ascending=False)[:7]
    list_exercise_name = []
    for i in range(len(alllike_exercise_name)):
        dict = {}
        dict['name'] = alllike_exercise_name.index[i]
        dict['value'] = alllike_exercise_name[i]
        list_exercise_name.append(dict)
    print(alllike_topic,alllike_area,alllike_exercise_name)
    return list_topic,list_area,list_exercise_name

#Percent of answer correctly at first attempt

#Percent of mastered exercises
def count_master_percentage():
    total = len(problem_log)
    mastered = 0
    for i in range(total):
        if problem_log['points_earned'][i] > 0:
            mastered = mastered + 1
    # print(mastered,total)
    return mastered,total

#基于用户的协同过滤推荐算法
def recommendation_student(problem_exercise,student_id=88703):
    #候选学生列表
    excellent_student_list = excellent_student().index.tolist()
    #题目列表
    student_exercise1 = list(problem_exercise[problem_exercise['学号'] == student_id]['所做练习名'])
    #题目长度
    len_exercise1 = len(student_exercise1)
    max_similarity = 0; rec_index = 0; rec_difference = [];

    #两个列表做交集运算
    for i in range(len(excellent_student_list)):
        if excellent_student_list[i] == student_id:#从候选学生列表中去除本人
            continue
        student_exercise2 = list(problem_exercise[problem_exercise['学号'] == excellent_student_list[i]]['所做练习名'])
        intersection = [i for i in student_exercise1 if i in student_exercise2]#列表的交运算，用来求相似度
        difference = [i for i in student_exercise2 if i not in student_exercise1]#列表的差运算，用来做推荐
        len_exercise2 = len(intersection)
        similarity = round(len_exercise2/len_exercise1,2)
        if similarity>max_similarity:
            max_similarity = similarity
            rec_index = i
            rec_difference = difference

        rec_difference = list(set(rec_difference))#推荐题目去重

    return max_similarity,excellent_student_list[rec_index],rec_difference_info(rec_difference),student_id

#处理rec_difference
def rec_difference_info(rec_difference):
    rec_list = []

    for i in rec_difference:
        dict1 = {'name':'','name_cn':'','area':'','topic':'','diff':0,'correct_rate':0}
        dict1['name'] = i
        dict1['name_cn'] = list(exercise[exercise['name'] == i]['pretty_display_name'])[0]
        dict1['area'] = list(exercise[exercise['name'] == i]['area'])[0]
        dict1['topic'] = list(exercise[exercise['name'] == i]['topic'])[0]
        #计算难度，难度为（1-正确率）取小数
        # print(i)
        len1 = len(list(exercise_correct[exercise_correct['exercise'] == i]['diff']))
        if len1 < 1:
            dict1['correct_rate'] = 0.000
            dict1['diff'] = 0.000
            print('题目正确率不详')
        else:
            dict1['correct_rate'] = list(exercise_correct[exercise_correct['exercise'] == i]['diff'])[0]
            dict1['diff'] = round(1-dict1['correct_rate'],2)

        rec_list.append(dict1)
    return rec_list

#题目难度系数计算，即正确率
def exercise_diff_count():
    diffs = []

    gb = problem_log.groupby(by=['exercise','correct']).size()
    # print(type(gb),gb[0])
    # print(gb.index)
    #存在正确率为100%或0%练习
    i=0
    while i < len(gb):
        diff = {'exercise': '', 'diff': 0, 'num': 0, 'correct_true': 0, 'correct_false': 0}
        if(gb.index[i][0] == gb.index[i+1][0]):#正确率介于0，1
            diff['exercise'] = gb.index[i][0]
            diff['diff'] = round(gb[i + 1] / (gb[i] + gb[i + 1]),2)
            diff['num'] = gb[i] + gb[i + 1]
            diff['correct_true'] = gb[i + 1]
            diff['correct_false'] = gb[i]
            i = i+2
        else:#正确率为1或0
            diff['exercise'] = gb.index[i][0]
            if gb.index[i][1] == True:#为1
                diff['diff'] = 1
                diff['num'] = gb[i]
                diff['correct_true'] = gb[i]
                diff['correct_false'] = 0
            else:#为0
                diff['diff'] = 0
                diff['num'] = gb[i]
                diff['correct_true'] = 0
                diff['correct_false'] = gb[i]
            i = i+1
        # print(diff)
        diffs.append(diff)

    # print(len(diffs))
    exercise_correct = pd.DataFrame(diffs)
    # print(exercise_correct)
    exercise_correct.to_csv('junyi_data/problemlog/exercise_correct.csv')
    # print(exercise_correct['num'].sum())

#计算学生做题的正确率，取前100个学生
def student_exercise_correct_rate_count():
    gb = problem_log.groupby(by=['user_id','correct']).size()
    # print(gb)
    students_correct = []
    i = 0
    # print(gb[gb.index[i]])
    # print(gb.values[0])
    while i < len(gb)-1:
        student_correct = {'user_id':0,'num':0,'correct_true':0,'correct_false':0,'correct_rate':0}
        if(gb.index[i][0] == gb.index[i+1][0]):#正确率介于0，1
            student_correct['user_id'] = gb.index[i][0]
            student_correct['num'] = gb[gb.index[i]]+gb[gb.index[i+1]]
            student_correct['correct_true'] = gb[gb.index[i+1]]
            student_correct['correct_false'] = gb[gb.index[i]]
            student_correct['correct_rate'] = round(student_correct['correct_true']/(student_correct['num']),2)
            i = i + 2
        else:#正确率为0或1
            student_correct['user_id'] = gb.index[i][0]
            if gb.index[i][1] == True:#为1
                student_correct['num'] = gb.values[i]
                student_correct['correct_true'] = gb.values[i]
                student_correct['correct_false'] = 0
                student_correct['correct_rate'] = 1
            else:#为0
                student_correct['num'] = gb.values[i]
                student_correct['correct_true'] = 0
                student_correct['correct_false'] = gb.values[i]
                student_correct['correct_rate'] = 0
            i = i + 1
        # print(student_correct)
        students_correct.append(student_correct)
    students_correct = pd.DataFrame(students_correct)
    students_correct.to_csv('junyi_data/problemlog/student_correct.csv')



if __name__=='__main__':
    problemlog_exercise = init_dataset()
    # exercise_diff_count()
    # student_exercise_correct_rate_count()
    # print(student_correct['num'].describe())

    print('over')
